In [2]:
Copied!
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import pandas as pd
import numpy as np
c:\Users\ericl\miniconda3\lib\site-packages\tqdm\auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
In [55]:
Copied!
# Load the training split of the stop records; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    'train.csv',
)
In [11]:
Copied!
# Preview the first five rows (5 is head()'s default) to sanity-check the load.
df.head(5)
Out[11]:
| raw_row_number | location | county_name | subject_age | subject_race | subject_sex | officer_id_hash | department_name | type | arrest_made | ... | outcome | frisk_performed | search_conducted | search_person | search_vehicle | reason_for_stop | raw_Ethnicity | raw_Race | raw_action_description | date_time | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12511107 | NaN | forsyth county | 18.0 | white | male | f2f6b08c97 | Winston-Salem Police Department | vehicular | False | ... | citation | False | False | False | False | Speed Limit Violation | N | W | Citation Issued | 2010-12-05 01:51:24 |
| 1 | 5439683 | raleigh | wake county | 25.0 | hispanic | male | 1e3fa73f20 | Raleigh Police Department | vehicular | False | ... | warning | False | True | True | True | Vehicle Regulatory Violation | H | W | Verbal Warning | 2005-09-25 03:40:00 |
| 2 | 18674698 | charlotte area | mecklenburg county | 30.0 | black | female | 59a754eb04 | Charlotte-Mecklenburg Police Department | vehicular | False | ... | warning | False | False | False | False | Speed Limit Violation | N | B | Verbal Warning | 2014-11-15 02:00:00 |
| 3 | 12600300 | charlotte area | mecklenburg county | 21.0 | white | male | 0dc507ea69 | Charlotte-Mecklenburg Police Department | vehicular | False | ... | warning | False | False | False | False | Vehicle Regulatory Violation | N | W | Verbal Warning | 2011-01-23 00:16:00 |
| 4 | 6035053 | NaN | durham county | 38.0 | black | female | 91822b2dfe | Durham Police Department | vehicular | False | ... | citation | False | False | False | False | Speed Limit Violation | N | B | Citation Issued | 2006-06-18 10:17:17 |
5 rows × 22 columns
In [7]:
Copied!
# Profile the training frame as loaded. Leaving `profile` as the last
# expression makes the notebook render the report inline. The recorded run
# spent about six minutes in the "Summarize dataset" stage.
profile = ProfileReport(
    df,
    title="Police Stop Profiling Report",
)
profile
Summarize dataset: 100%|██████████| 41/41 [06:19<00:00, 9.25s/it, Completed] Generate report structure: 100%|██████████| 1/1 [00:07<00:00, 7.29s/it] Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.73s/it]
Out[7]:
In [56]:
Copied!
# Encode the target as integers, then drop the columns that are not used as
# model inputs.
#
# Assignment is used instead of `df[col].replace(..., inplace=True)`: that form
# mutates a temporary Series, which pandas 2.2 warns about and which leaves
# `df` unchanged once Copy-on-Write is the default (pandas 3.0).
outcome_codes = {'warning': 0, 'citation': 1, 'arrest': 2}
# infer_objects() keeps the encoded column numeric on pandas versions where
# replace() no longer downcasts an object column automatically; it is a no-op
# when the column is already numeric.
df["outcome"] = df["outcome"].replace(outcome_codes).infer_objects()

# NOTE(review): 'arrest_made', 'citation_issued' and 'warning_issued' look like
# restatements of `outcome` (target leakage if kept) -- confirm against the
# dataset documentation.
df = df.drop(columns=[
    'date_time', 'department_name', 'county_name', 'type',
    'raw_Race', 'raw_Ethnicity', 'raw_action_description',
    'subject_race', 'location', 'raw_row_number', 'officer_id_hash',
    'arrest_made', 'citation_issued', 'warning_issued',
])
In [57]:
Copied!
# Turn the remaining non-numeric columns into numbers.
#
# `reason_for_stop` is one-hot encoded: one 0/1 indicator column per distinct
# reason, after which the original text column is dropped. Note that
# str.get_dummies() splits on '|' by default, so a reason containing '|' would
# yield several indicators.
reason_dummies = df['reason_for_stop'].str.get_dummies()
df = pd.concat([df, reason_dummies], axis=1).drop(columns=['reason_for_stop'])

# Assignment is used instead of `df[col].replace(..., inplace=True)`: that form
# mutates a temporary Series, which pandas 2.2 warns about and which leaves
# `df` unchanged once Copy-on-Write is the default (pandas 3.0).
# infer_objects() keeps the result numeric on pandas versions where replace()
# no longer downcasts automatically.
df['subject_sex'] = df['subject_sex'].replace({'male': 1, 'female': 0}).infer_objects()

# Boolean flags become 1/0; values outside the mapping (e.g. missing entries)
# are left as they are, exactly as replace() did before.
for flag_col in ['frisk_performed', 'search_conducted', 'search_person', 'search_vehicle']:
    df[flag_col] = df[flag_col].replace({True: 1, False: 0}).infer_objects()
In [76]:
Copied!
# Profile `df` again under a separate title; the bare expression renders the
# report inline. The recorded run spent close to ten minutes in the
# "Summarize dataset" stage.
ProfileReport(
    df,
    title='Useful Profile Report',
)
Summarize dataset: 100%|██████████| 32/32 [09:45<00:00, 18.29s/it, Completed] Generate report structure: 100%|██████████| 1/1 [00:09<00:00, 9.28s/it] Render HTML: 100%|██████████| 1/1 [00:01<00:00, 1.35s/it]
Out[76]:
In [66]:
Copied!
# Read the held-out split and apply the same encoding steps that were applied
# to the training frame `df`.
test = pd.read_csv('test.csv')

# Assignment is used instead of `test[col].replace(..., inplace=True)`: that
# form mutates a temporary Series, which pandas 2.2 warns about and which
# leaves `test` unchanged once Copy-on-Write is the default (pandas 3.0).
# infer_objects() keeps the encoded columns numeric on pandas versions where
# replace() no longer downcasts automatically.
test["outcome"] = test["outcome"].replace({'warning': 0, 'citation': 1, 'arrest': 2}).infer_objects()

test = test.drop(columns=[
    'date_time', 'department_name', 'county_name', 'type',
    'raw_Race', 'raw_Ethnicity', 'raw_action_description',
    'subject_race', 'location', 'raw_row_number', 'officer_id_hash',
    'arrest_made', 'citation_issued', 'warning_issued',
])

# One-hot encode the stop reason, then drop the original text column.
test_reason_dummies = test['reason_for_stop'].str.get_dummies()
test = pd.concat([test, test_reason_dummies], axis=1).drop(columns=['reason_for_stop'])

test['subject_sex'] = test['subject_sex'].replace({'male': 1, 'female': 0}).infer_objects()
for flag_col in ['frisk_performed', 'search_conducted', 'search_person', 'search_vehicle']:
    test[flag_col] = test[flag_col].replace({True: 1, False: 0}).infer_objects()

# The indicator columns above come from the reasons present in test.csv only,
# so they can differ from the training columns. Align to the already-encoded
# training frame: a reason absent from the test split becomes an all-zero
# column, a reason never seen in training is dropped, and the column order
# matches `df`. This is a no-op when both splits already agree.
# Requires the training encoding cells to have run before this one.
test = test.reindex(columns=df.columns, fill_value=0)
In [71]:
Copied!
# Separate the data for modeling: every column except 'outcome' is a feature,
# and 'outcome' itself is the target.
X_train, X_test = (frame.drop(columns=['outcome']) for frame in (df, test))
Y_train, Y_test = df['outcome'], test['outcome']
In [73]:
Copied!
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on all features and predict the test outcomes.
# fit() returns the estimator itself, so `model` is the fitted model.
# NOTE(review): `outcome` is encoded with three codes (0/1/2); newer
# scikit-learn releases appear to deprecate liblinear for multiclass targets --
# confirm against the installed version.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
Y_pred = model.predict(X_test)
In [75]:
Copied!
# Accuracy: the fraction of test rows whose predicted outcome equals the
# actual one.
(Y_test == Y_pred).mean()
Out[75]:
0.661227829379674
Not very accurate: the model predicts only about 66% of the test outcomes correctly.
In [78]:
Copied!
# Retry with a hand-picked subset of columns: the search flag plus four of the
# stop-reason indicators. The same selection is applied to both splits.
X_train, X_test = (
    frame[[
        'search_conducted',
        'Vehicle Equipment Violation',
        'Driving While Impaired',
        'Safe Movement Violation',
        'Speed Limit Violation',
    ]]
    for frame in (df, test)
)
Y_train, Y_test = df['outcome'], test['outcome']
In [79]:
Copied!
# Refit the same logistic regression on the current X_train / Y_train and
# predict on X_test. fit() returns the estimator itself.
model = LogisticRegression(solver='liblinear').fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# Accuracy: the fraction of test rows predicted correctly.
(Y_test == Y_pred).mean()
Out[79]:
0.6585383071749138